// vim: set syntax=asm :

// Scalar operation entry points for i32 data. Each line below instantiates the
// shared fma_mmm_ymm_scalar.tmpliq template once, over registers from..to, with
// a distinct label and the packed 32-bit integer instruction passed as `op`:
//   vpminsd / vpmaxsd : signed minimum / maximum
//   vpmulld           : multiply, keeping the low 32 bits of each product
//   vpaddd  / vpsubd  : add / subtract
// NOTE(review): the included template is not visible from this file, so how it
// consumes `label`, `op`, `type` and `flipped` should be checked there;
// `flipped: true` presumably swaps the operand order of the subtraction -- confirm.
{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_min", op:"vpminsd", from:from, to:to, type:"i32" %}
{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_max", op:"vpmaxsd", from:from, to:to, type:"i32" %}
{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_mul", op:"vpmulld", from:from, to:to, type:"i32" %}
{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_add", op:"vpaddd", from:from, to:to, type:"i32" %}
{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_sub", op:"vpsubd", from:from, to:to, type:"i32" %}
{% include "fma_mmm_ymm_scalar.tmpliq" label:"scalar_sub_flipped", op:"vpsubd", from:from, to:to, flipped: true, type:"i32" %}

{{L}}leaky_relu:
    // Integer leaky ReLU applied in place to every register in from..to:
    // lanes that are negative (signed) become lane * alpha, all other lanes
    // are left untouched.
    // Only ymm12..ymm15 may be used as scratch here.

    // ymm14 <- 0 in every lane; reference value for the sign test
    vpxor           ymm14, ymm14, ymm14
    // ymm15 <- the 32-bit alpha loaded from [rdi + 8], replicated to all lanes
    vbroadcastss    ymm15, dword ptr [rdi + 8]

    {% for reg in (from..to) %}
        // ymm13 <- all-ones in each lane where 0 > value (signed compare)
        vpcmpgtd    ymm13, ymm14, ymm{{reg}}
        // ymm12 <- low 32 bits of alpha * value
        vpmulld     ymm12, ymm15, ymm{{reg}}
        // pick the scaled lane where the mask is set, keep the original otherwise
        vblendvps   ymm{{reg}}, ymm{{reg}}, ymm12, ymm13
    {% endfor %}

    jmp    {{L}}non_linear_loop
